library(readr)
library(tidyverse)
library(ggplot2)
library(SnowballC)
library(tidytext)
library(stringr)
library(tidyr)
library(lubridate)
library(vistime)
library(scales)
# Load the Y*Int social media dataset.
YInt <- read_csv("~/Documents/2021 Spring/SDS235/DC5-Data/Y*Int Social Media Data/YInt.csv")
# Messages mentioning KRAK TV (including reposts), ordered by time.
# FIX: use the bare `message` column inside filter() rather than
# `YInt$message` — referencing the data frame by name bypasses dplyr's
# data masking and silently breaks if earlier pipe steps change the rows.
krak_yint <- YInt %>%
  filter(str_detect(message, pattern = "KRAK TV")) %>%
  arrange(time)
# Original (non-repost) KRAK TV messages: drop anything marked "re:".
origin_krak <- filter(krak_yint, !str_detect(message, "re:"))
# Every post made by the official KRAK TV account.
account_krak <- filter(YInt, account == "KRAKTV")
# Every post made by the EarthQuakeSeers account.
earth_alert <- filter(YInt, account == "EarthQuakeSeers")
# Keep only messages sent after the earthquake struck (2020-04-08 08:36).
yint_eq <- YInt %>%
  filter(time >= ymd_hms("2020-04-08 08:36:00"))

# Helper: original (non-repost) messages matching `pattern`, ordered by time.
# Extracted because the detect / exclude-"re:" / arrange pipeline was
# repeated verbatim three times below.
filter_original <- function(df, pattern) {
  df %>%
    filter(str_detect(message, pattern = pattern)) %>%
    filter(!str_detect(message, pattern = "re:")) %>%
    arrange(time)
}

# Original messages from government departments, the City EOC, and @TVHostBrad.
officials_yint <- filter_original(yint_eq, "Department")
city_eoc <- filter_original(yint_eq, "City EOC")
tvhostbrad <- filter_original(yint_eq, "@TVHostBrad")

# All post-quake posts from the Department of Transportation account
# (bridge / road status updates).
bridge <- yint_eq %>%
  filter(account == "DOT-StHimark")
# Compile reliable messages based on our assumptions, de-duplicated by
# message text (first occurrence kept), ordered by time.
# bind_rows() is used instead of rbind(): it matches columns by name
# rather than position, so it is robust to differing column order.
reliable_mess <- bind_rows(account_krak, origin_krak, officials_yint,
                           city_eoc, bridge, earth_alert) %>%
  distinct(message, .keep_all = TRUE) %>%
  arrange(time)
# Export the messages to a CSV. row.names = FALSE prevents write.csv from
# emitting a nameless row-index column — that default is what produced the
# "Missing column names filled in: 'X1'" warning when the file is re-read.
write.csv(reliable_mess, "reliable_message.csv", row.names = FALSE)
# A selection of events and a summary column were added manually in *_tl.csv.
tl <- read_csv("reliable_message_tl.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## time = col_character(),
## location = col_character(),
## account = col_character(),
## message = col_character(),
## event = col_character()
## )
# Parse the timestamp and derive a day column for per-day filtering.
# BUG FIX: the original format string used "%M" (minutes) for the month
# field; the correct strptime code for a numeric month is "%m". With "%M"
# every timestamp parsed with the wrong (or missing) month.
tl <- tl %>%
  mutate(
    time = as.POSIXct(time, format = "%m/%d/%y %H:%M"),
    date = floor_date(time, unit = "days")
  )
# Helper: reshape labeled event rows into the column layout vistime expects
# (start, end, group, event). Events are instantaneous, so end == start.
# Extracted because the pipeline was duplicated for both date ranges.
as_timeline_data <- function(df) {
  df %>%
    filter(!is.na(event)) %>%
    mutate(time2 = time) %>%
    select(time, time2, location, event) %>%
    rename(start = time,
           end = time2,
           group = location)
}

# Events on the day of the quake (Apr 8) vs. the days after (Apr 9+).
tl_data_08 <- tl %>%
  filter(date == "2020-04-08") %>%
  as_timeline_data()
tl_data_09 <- tl %>%
  filter(date >= as.Date("2020-04-09")) %>%
  as_timeline_data()

# Timeline plots for each period; optimize_y packs event rows compactly.
vistime(tl_data_08, optimize_y = TRUE)
vistime(tl_data_09, optimize_y = TRUE)
# Tokenize post-quake messages into single words, bucketed by hour, with
# stop words removed. Join keys are spelled out explicitly (by = "word")
# instead of relying on dplyr's by-common-column-name auto-join, which is
# fragile if either table gains another shared column.
text_eq <- yint_eq %>%
  mutate(hour = floor_date(time, unit = "hours")) %>%
  unnest_tokens(word, message) %>%
  anti_join(stop_words, by = "word")
# Hourly net sentiment per location: count positive/negative words from the
# Bing lexicon, then score = positive - negative.
sentiment <- text_eq %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(location, index = hour, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
# Downtown-only slice (kept for ad-hoc inspection; not used in the plot below).
sentiment_dt <- sentiment %>%
  filter(location == "downtown")
# Hourly net sentiment per location as bar columns, one facet per location
# (free x scales), x axis labeled as HH:MM; shrink legend text to fit.
ggplot(sentiment, aes(x = index, y = sentiment, fill = location)) +
  geom_col() +
  facet_wrap(~ location, scales = "free_x") +
  scale_x_datetime(labels = date_format("%H:%M")) +
  theme_minimal() +
  theme(
    legend.title = element_text(size = 5),
    legend.text = element_text(size = 4)
  )
(The graph can be expanded in the R Markdown window but looks cluttered in the HTML output; see the report for reference.)
facet_wrap error: https://stackoverflow.com/questions/66361247/error-with-ggplot-facet-wrap-error-scale-id-must-not-be-na
sentiment dataset: Saif M. Mohammad and Peter Turney. (2013), "Crowdsourcing a Word-Emotion Association Lexicon." Computational Intelligence, 29(3): 436-465.
sentiment analysis: https://www.tidytextmining.com/sentiment.html
timeline visualization https://cran.r-project.org/web/packages/vistime/vignettes/vistime-vignette.html